We simulate some data generating processes (DGPs), then fit random forests to samples drawn from the simulated DGPs.
#191231 script1
# preliminaries ----------------------
library(tidyverse)
library(randomForest)
library(sp)
library(magick) # to make gifs
In this session we make a DGP for a classification problem. We build a region which is a circle: inside the circle the class is 1, outside it the class is 0. The total region is a square of 10 by 10. The circle has center (5,5) and radius 3.
#simulate train test sets -----------------------------
#simulate a dgp for a classification problem: a circle in the middle of a square area
#1st
#simulate a uniform random distribution in 2d
#training set
set.seed(1)
x <- runif(n = 1000, min = 0, max = 10)
y <- runif(n = 1000, min = 0, max = 10)
df <- data.frame(x, y)
# class is 1 inside the circle centered at (5, 5) with radius 3 (radius^2 = 9), else 0.
# Vectorized indicator replaces the original row-wise apply(), which coerces the
# data.frame to a matrix and is O(n) function calls; the values are identical.
df$z <- factor(as.integer((x - 5)^2 + (y - 5)^2 <= 9))
str(df)
## 'data.frame': 1000 obs. of 3 variables:
## $ x: num 2.66 3.72 5.73 9.08 2.02 ...
## $ y: num 5.31 6.85 3.83 9.55 1.18 ...
## $ z: Factor w/ 2 levels "0","1": 2 2 2 1 1 1 1 2 1 1 ...
#an article to finish:
#https://towardsdatascience.com/learn-classification-with-random-forests-in-r-998d3a734098
#test set
#drawn from the same DGP as the training set, with an independent seed
set.seed(2)
x <- runif(n = 1000, min = 0, max = 10)
y <- runif(n = 1000, min = 0, max = 10)
df_test <- data.frame(x, y)
# same circle-membership rule, vectorized instead of the row-wise apply()
df_test$z <- factor(as.integer((x - 5)^2 + (y - 5)^2 <= 9))
set.seed(1)
model <- randomForest(formula = z ~ x + y, data = df, ntree = 100, mtry = 1, nodesize = 1)
#if mtry = number of covariates then the random forest is a bagging of trees
model
##
## Call:
## randomForest(formula = z ~ x + y, data = df, ntree = 100, mtry = 1, nodesize = 1)
## Type of random forest: classification
## Number of trees: 100
## No. of variables tried at each split: 1
##
## OOB estimate of error rate: 2.5%
## Confusion matrix:
## 0 1 class.error
## 0 722 13 0.01768707
## 1 12 253 0.04528302
zh <- predict(model, df_test)
# BUG FIX: test-set accuracy must compare the TEST labels with the test-set
# predictions. The original compared df$z (the TRAINING labels, in training-row
# order) against zh, which is why it reported a meaningless 0.615 and concluded
# "low accuracy in test data". The corrected comparison is much higher,
# consistent with the ~2.5% OOB error above.
(df_test$z == zh) %>% mean #accuracy on the test set
# scatter of the training sample over the 2-d square, colored by class
ggplot(df, aes(x = x, y = y, color = z)) +
  geom_point(size = 3, alpha = 0.7) +
  ggtitle("Simulated classification problem")
#create circle dgp's frontier -----------------------
# trace the circle boundary: upper semicircle left-to-right,
# then lower semicircle right-to-left, so the ring closes on itself
upper_x <- seq(2, 8, by = 0.1)
lower_x <- seq(8, 2, by = -0.1)
x_coord <- c(upper_x, lower_x)
plot(x_coord)
# y = 5 +/- sqrt(r^2 - (x - 5)^2) with r = 3
y_coord <- c(5 + sqrt(9 - (upper_x - 5)^2), 5 - sqrt(9 - (lower_x - 5)^2))
plot(y_coord)
plot(x_coord, y_coord)
xym <- cbind(x_coord, y_coord)
# wrap the ring of coordinates into an sp SpatialPolygons object
ring <- Polygon(xym)
ring_set <- Polygons(list(ring), 1)
sps <- SpatialPolygons(list(ring_set))
plot(sps)
# data.frame form (long/lat columns) so ggplot2 can draw it
circle_fortify <- fortify(sps)
ggplot() +
  geom_point(data = df, aes(y = y, x = x, color = z), size = 3, alpha = 0.7) +
  geom_polygon(data = circle_fortify, aes(long, lat), fill = NA, color = "black") +
  coord_equal() +
  ggtitle("Simulated classification problem")
#analyze prediction ---------------------------------
# evaluate the fitted model on a regular 71 x 71 grid covering the square
x <- seq(0, 10, by = 10 / 70)
y <- seq(0, 10, by = 10 / 70)
plot(x, y)
df_sim <- expand.grid(x = x, y = y)
class(df_sim)
## [1] "data.frame"
dim(df_sim)
## [1] 5041 2
plot(df_sim$x, df_sim$y)
# predicted class at every grid point shows the decision surface
zh_sim <- predict(model, df_sim)
df_sim$z <- zh_sim
ggplot() +
  geom_polygon(data = circle_fortify, aes(long, lat), fill = NA, color = "black") +
  geom_point(data = df_sim, aes(y = y, x = x, color = z), size = 5, alpha = 0.7) +
  ggtitle("Simulated ntree=100")
#there are some rough edges...
#this model was built with ntree=100, mtry=1, nodesize=1
#function of graph and accuracy ---------------
# Refit a random forest on the global training set `df`, compute out-of-sample
# accuracy on `df_test`, and optionally draw the decision surface on the global
# grid `df_sim` with the true frontier `circle_fortify` overlaid.
#
# Args:
#   model    - kept only for backward compatibility with the calls below; it is
#              immediately overwritten by the refit and never read.
#   ntree, mtry, nodesize - randomForest tuning parameters.
#   graph    - if TRUE, build the ggplot; if FALSE, skip it (faster for loops).
#
# Returns a 2-element list: [[1]] the ggplot (NULL when graph = FALSE),
# [[2]] the test-set accuracy.
plot_rf <- function(model, ntree = 100, mtry = 2, nodesize = 1, graph = TRUE) {
  # preallocate both slots so [[2]] is well-defined even when graph = FALSE
  output <- vector("list", 2)
  set.seed(1) # refit reproducibly, so runs with equal tuning parameters match
  model <- randomForest(formula = z ~ x + y, data = df,
                        ntree = ntree, mtry = mtry, nodesize = nodesize)
  zh <- predict(model, df_test)
  accuracy <- mean(df_test$z == zh) # out-of-sample accuracy
  zh_sim <- predict(model, df_sim)
  df_sim$z <- zh_sim # modifies a local copy only; the global df_sim is untouched
  if (isTRUE(graph)) {
    output[[1]] <- ggplot() +
      geom_point(data = df_sim, aes(y = y, x = x, color = z), size = 3, alpha = 0.7) +
      geom_polygon(data = circle_fortify, aes(long, lat), fill = NA, color = "black") +
      ggtitle(paste0("Simulated ", "ntree=", ntree, ";nodesize=", nodesize,
                     ";accuracy=", accuracy))
  }
  output[[2]] <- accuracy
  output
}
# decision surface of a proper random forest (mtry = 1 of the 2 covariates)
plot_rf(model,ntree=100,mtry=1,nodesize=1)[[1]]
# the next call uses mtry = 2 = all covariates, i.e. the case of the bagging model
plot_rf(model,ntree=100,mtry=2,nodesize=1)
## [[1]]
##
## [[2]]
## [1] 0.978
The only difference between this graph and the previous one is mtry. Here mtry=2, before mtry=1. The only difference between a Random Forest and a Bagging of Trees is that the number of variables available at each step of tree building is smaller.
In our case there are only 2 variables for the algorithm to choose from. But in cases with many more variables, limiting and randomly assigning the variables the algorithm may choose from improves the model’s out-of-sample prediction. That is, restricting the algorithm’s choice decreases variance.
# random forest with just 1 tree -------------------
# a single tree, still fitted on a bootstrap sample and grown to nodesize = 1
plot_rf(model,ntree=1,mtry=2,nodesize=1)
## [[1]]
##
## [[2]]
## [1] 0.963
When we use 1 tree the model performs worse and, as expected, the edges are rougher. This is a full tree: on a bootstrap sample, the tree is grown until it reaches a nodesize of 1.
#graph showing how accuracy changes with ntree -----------------
# dense grid for small ntree, coarser steps beyond 50. The original
# c(1:50, seq(50, 300, by = 5)) evaluated ntree = 50 twice; fixed here.
tree_seq <- c(1:50, seq(55, 300, by = 5))
len1 <- length(tree_seq)
# test-set accuracy of the random forest (mtry = 1) at each ntree
accuracy_seq <- rep(NA_real_, len1) # preallocated
for (ii in seq_len(len1)) {
  accuracy_seq[ii] <- plot_rf(model, ntree = tree_seq[ii], mtry = 1,
                              nodesize = 1, graph = FALSE)[[2]]
}
# same sweep for bagging (mtry = 2 = all covariates)
accuracy_seq_bag <- rep(NA_real_, len1)
for (ii in seq_len(len1)) {
  accuracy_seq_bag[ii] <- plot_rf(model, ntree = tree_seq[ii], mtry = 2,
                                  nodesize = 1, graph = FALSE)[[2]]
}
# black line: random forest (mtry = 1); blue line: bagging (mtry = 2)
plot(tree_seq, accuracy_seq, type = 'l')
lines(tree_seq, accuracy_seq_bag, col = 'blue')
Using mtry=1 is better, as expected: random forests in general perform better than bagging in the test set. The black line is mtry=1, the blue line is mtry=2.
#animations ntree -------------------
#how the fit changes with the number of trees
tree_seq_gif <- seq(1, 30, by = 1)
# make sure the output folder exists before writing frames into it
dir.create("images/ntree", recursive = TRUE, showWarnings = FALSE)
for (ii in seq_along(tree_seq_gif)) {
  ntree <- tree_seq_gif[ii]
  # pass the plot to ggsave explicitly: inside a for loop the ggplot object is
  # never auto-printed, so relying on the "last displayed plot" default is fragile
  frame_plot <- plot_rf(model, ntree = ntree, mtry = 1, nodesize = 1)[[1]]
  # 100 + ntree pads the number so lexicographic file order == numeric order
  ggsave(paste0("images/ntree/", "ntree", 100 + ntree, ".jpg"), plot = frame_plot,
         width = 25, height = 25, units = "cm", dpi = 60)
}
#to create gifs
library(magick) # to make gifs (already attached at the top; repeating is harmless)
# list.files(pattern=) takes a regex, not a glob: "\\.jpg$" matches ".jpg" suffixes,
# whereas the original "*.jpg" only worked by accident
list.files(path = "./images/ntree", pattern = "\\.jpg$", full.names = TRUE) %>%
  map(image_read) %>% # reads each path file
  image_join() %>% # joins images
  image_animate(fps = 2) %>% # animates, can opt for number of loops
  image_write("ntree_change.gif") # write to current dir